library(readxl)
library(tidyverse)
library(patchwork)
library(kableExtra)
library(dplyr)
library(plotly)#MusicDataset <- read_excel("/Users/rileyvalashinas/MusicDataset.xlsx")
MusicDataset <- read_excel("/Users/sydneyball/Desktop/Data 412/project/MusicDataset.xlsx")
Data found on Kaggle. This dataset provides song lyrics from 1950 to 2019 together with music metadata such as sadness, danceability, loudness, and acousticness. The authors also provide the lyrics themselves, which can be used for natural language processing.
Citation: Moura, Luan; Fontelles, Emanuel; Sampaio, Vinicius; França, Mardônio (2020), “Music Dataset: Lyrics and Metadata from 1950 to 2019”, Mendeley Data, V3, doi: 10.17632/3t9vbwxgr5.3
# Presumably the same load using another collaborator's local path, kept for
# reference — confirm before deleting.
#Top_Song <- read_excel("/Users/rileyvalashinas/Top Songs in the World.xlsx",
#skip = 1)
# Read the Top Songs workbook; skip = 1 makes read_excel() skip the first row
# of the sheet before it starts reading.
Top_Song <- read_excel("/Users/sydneyball/Desktop/Data 412/project/Top Songs in the World.xlsx",
skip = 1)Dataset was also found on Kaggle. “Top Songs of the World” is a collection of information about popular songs spanning various decades and genres. The dataset includes details such as the ranking of songs, the respective artists, titles, release years, sales figures, streaming statistics, download counts, radio play metrics, and a numerical rating.
Link to Top Songs in the World Dataset
## [1] 851160
## [1] 30
In the Music 1950-2019 dataset there are 28,372 observations (rows) and 30 columns (variables). The value 851160 printed above is the total number of cells (28,372 rows × 30 columns), not the number of rows.
## [1] "artist_name" "track_name"
## [3] "release_date" "genre"
## [5] "lyrics" "len"
## [7] "dating" "violence"
## [9] "world/life" "night/time"
## [11] "shake the audience" "family/gospel"
## [13] "romantic" "communication"
## [15] "obscene" "music"
## [17] "movement/places" "light/visual perceptions"
## [19] "family/spiritual" "like/girls"
## [21] "sadness" "feelings"
## [23] "danceability" "loudness"
## [25] "acousticness" "instrumentalness"
## [27] "valence" "energy"
## [29] "topic" "age"
Artists name = categorical (mostly unique values)
# Tally the songs per artist in the Music Dataset, largest counts first.
artist_count <- count(MusicDataset, artist_name, name = "Count", sort = TRUE)

# Keep the 20 largest counts (top_n() also keeps ties at the cut-off).
top_20_artists <- top_n(artist_count, 20, Count)

# Horizontal bar chart of those artists, ordered by count and labelled with
# the count just past the end of each bar.
ggplot(
  data = top_20_artists,
  mapping = aes(x = reorder(artist_name, Count), y = Count)
) +
  geom_col(fill = "blue4") +
  geom_text(
    mapping = aes(label = Count),
    position = position_dodge(width = 0.9),
    hjust = -0.2,
    size = 3
  ) +
  coord_flip() +
  labs(
    title = "Top 20 Songs by Artists from 1950 to 2019",
    x = "Artist",
    y = "Count of Top Songs"
  ) +
  theme_minimal()

# Histogram of release_date using bins of width 1.
ggplot(data = MusicDataset, mapping = aes(x = release_date)) +
  geom_histogram(binwidth = 1, fill = "skyblue2", color = "blue4") +
  labs(
    title = "Distribution of Song Releases Over Time",
    x = "Release Year",
    y = "Count of Songs"
  ) +
  theme_minimal()
##
## blues country hip hop jazz pop reggae rock
## 4604 5445 904 3845 7042 2498 4034
##
## feelings music night/time obscene romantic sadness violence
## 612 2303 1825 4882 1524 6096 5710
## world/life
## 5420
The total break down for the Music Dataset from 1950 to 2019 is: - 23 decimal variables - 5 string variables - 3 integer variables
## [1] 38800
## [1] 8
In the Top Songs in the World dataset there are 4,850 observations (top songs) and 8 columns (variables) by which the songs are measured. The value 38800 printed above is the total number of cells (4,850 rows × 8 columns), not the number of rows.
## [1] "Artist" "Title" "Year" "Sales" "Streams"
## [6] "Downloads" "Radio Plays" "Rating"
Artist = Includes all the names of artists in the top_songs
# Tally the entries per artist in the Top Songs data, largest counts first.
artist_counts_2 <- count(Top_Song, Artist, name = "Count", sort = TRUE)

# Keep the 20 largest counts (top_n() also keeps ties at the cut-off).
top_20_artists_2 <- top_n(artist_counts_2, 20, Count)

# Horizontal bar chart of those artists, ordered by count and labelled with
# the count just past the end of each bar.
ggplot(
  data = top_20_artists_2,
  mapping = aes(x = reorder(Artist, Count), y = Count)
) +
  geom_col(fill = "skyblue2") +
  geom_text(
    mapping = aes(label = Count),
    position = position_dodge(width = 0.9),
    hjust = -0.2,
    size = 3
  ) +
  coord_flip() +
  labs(
    title = "Top 20 Artists by Number of Top Songs",
    x = "Artist",
    y = "Count of Top Songs"
  ) +
  theme_minimal()

# Histogram of Year using bins of width 1.
ggplot(data = Top_Song, mapping = aes(x = Year)) +
  geom_histogram(binwidth = 1, fill = "skyblue2", color = "blue4") +
  labs(
    title = "Distribution of Top Songs Releases Over Time",
    x = "Release Year",
    y = "Count of Songs"
  ) +
  theme_minimal()

# Earliest and latest year present, ignoring missing values.
min_year <- min(Top_Song$Year, na.rm = TRUE)
max_year <- max(Top_Song$Year, na.rm = TRUE)
print(paste("The range of years is from", min_year, "to", max_year))
## [1] "The range of years is from 1901 to 2014"
# ---- Clean the Music Dataset ----
# release_date holds a plain year as a number (it is binned with binwidth = 1
# in the histogram earlier in this report). as.Date() does not parse a number
# with `format`; it reads it as days since 1970-01-01, which places every song
# in mid-1975. Store the year as an integer so year-based summaries and plots
# keep their meaning.
MusicDataset$release_date <- as.integer(MusicDataset$release_date)
# Drop rows that contain any missing value.
MusicDataset <- na.omit(MusicDataset)
# Drop exact duplicate rows.
MusicDataset <- MusicDataset[!duplicated(MusicDataset), ]
# Lower-case the genre labels so spelling variants collapse to one level.
MusicDataset$genre <- tolower(MusicDataset$genre)

# ---- Clean the Top Songs dataset ----
# Standardize the column names ("Radio Plays" loses its space). Names are
# assigned by position, so this relies on the sheet's column order.
colnames(Top_Song) <- c("Artist", "Title", "Year", "Sales", "Streams", "Downloads", "RadioPlays", "Rating")
# Make sure the variable 'Year' is numeric.
Top_Song$Year <- as.numeric(Top_Song$Year)
# Drop rows that contain any missing value. Sales was previously mean-imputed
# right after this step, but no NA can remain once na.omit() has run, so that
# imputation never changed anything and has been removed.
Top_Song <- na.omit(Top_Song)
# Drop exact duplicate rows (this step used to repeat na.omit() instead).
Top_Song <- Top_Song[!duplicated(Top_Song), ]
# Lower-case artist names so they print consistently.
Top_Song$Artist <- tolower(Top_Song$Artist)
#Remove possible outliers in the data
Top_Song <- subset(Top_Song, Sales <= quantile(Sales, 0.99))Is the data (Music Dataset & Top Songs) clean enough? - To ensure that the both datasets were cleaned well and to a standard which we can then use them to do more statistical analysis we look at the sums of omitted values in each column.
## artist_name track_name release_date
## 0 0 0
## genre lyrics len
## 0 0 0
## dating violence world/life
## 0 0 0
## night/time shake the audience family/gospel
## 0 0 0
## romantic communication obscene
## 0 0 0
## music movement/places light/visual perceptions
## 0 0 0
## family/spiritual like/girls sadness
## 0 0 0
## feelings danceability loudness
## 0 0 0
## acousticness instrumentalness valence
## 0 0 0
## energy topic age
## 0 0 0
## Artist Title Year Sales Streams Downloads RadioPlays
## 0 0 0 0 0 0 0
## Rating
## 0
We call the summary of each cleaned dataset and we also assess the structure of the data now that it is renamed and all the n/a observations are cleared.
We characterize the data to make sure that all our cleaning was effective and to understand the spread of the data and observations. Characterizing the data also allows us to better understand possible comparisons between variables that we can make. This part is critical to our next step, exploratory data analysis.
# Summarise when the songs in the cleaned Music Dataset were released.
# NOTE(review): if release_date is a Date, sd() is measured in days, not
# years — confirm the column's units against the "years" in the message.
# Mean release date
mean_musicdata <- mean(clean_MusicDataset$release_date, na.rm = TRUE)
# Median release date
median_musicdata <- median(clean_MusicDataset$release_date, na.rm = TRUE)
# Standard deviation of release date
sd_musicdata <- sd(clean_MusicDataset$release_date, na.rm = TRUE)
# Print findings. The message now names the statistic that is computed (the
# mean release date), not a "mean number of songs".
print(paste("The mean release date of songs in the Music Dataset from 1950 to 2019 is", mean_musicdata, "and the median is", median_musicdata, "with a standard deviation of", round(sd_musicdata,2),"years."))
## [1] "The mean number of songs plublished from the Music Dataset from 1950 to 2019 is 1975-06-14 and the median is 1975-06-15 with a standard deviation of 18.49 years."
# Summarise the release years in the cleaned Top Songs data.
# Mean release year
mean_topsong <- mean(clean_topSong$Year, na.rm = TRUE)
# Median release year
median_topsong <- median(clean_topSong$Year, na.rm = TRUE)
# Standard deviation of release year
sd_topsong <- sd(clean_topSong$Year, na.rm = TRUE)
# Print findings. The message now names the statistic that is computed (the
# mean release year), not a "mean number of songs".
print(paste("The mean release year of songs in the Top Song dataset is", round(mean_topsong,2), "with the median year being", round(median_topsong,2), "with a standard deviation of", round(sd_topsong,2), "years."))
## [1] "The mean number of songs plublished from the Top Song dataset is the year 1979.83 with the median year being 1981 with a standard deviation of 19.79 years."
Our main goal is to analyze how different songs provoke different emotions in listeners. This means our focus is more tailored towards the Music Dataset from 1950-2019. After analyzing this dataset, we wanted to look at the crossover between the emotions elicited by listening to a song and their possible correlation with the song's popularity.
# Bar charts of song counts by release date, genre, and topic.
plot1 <- ggplot(clean_MusicDataset, aes(x = release_date)) +
  geom_bar(fill = "skyblue2") +
  labs(title = "Count of Songs Release by Year", x = "Date of Release", y = "Count")

plot2 <- ggplot(clean_MusicDataset, aes(x = genre)) +
  geom_bar(fill = "blue4") +
  labs(title = "Count of Songs by Genre", x = "Genre", y = "Count")

plot3 <- ggplot(clean_MusicDataset, aes(x = topic)) +
  geom_bar(fill = "blue4") +
  labs(title = "Count of Songs by Topic", x = "Topic", y = "Count")

# patchwork layout: release dates on top, genre and topic side by side below.
plot1 / (plot2 + plot3)

# Overall Pearson correlation between danceability and loudness; it is shown
# as the subtitle of plot4.
correlation_coefficient1 <- with(clean_MusicDataset, cor(danceability, loudness))

# Scatter plot with the default smoother over all songs.
plot4 <- ggplot(clean_MusicDataset, aes(x = danceability, y = loudness)) +
  geom_point(color = "skyblue2") +
  geom_smooth(color = "blue4") +
  labs(
    title = "Dance by Loud Rating",
    subtitle = correlation_coefficient1,
    x = "Dance Rating",
    y = "Loudness/Noise Rating"
  )

# Same relationship with a straight-line fit, one panel per genre.
plot4a <- ggplot(clean_MusicDataset, aes(x = danceability, y = loudness)) +
  geom_point(color = "skyblue2") +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  facet_wrap(~genre) +
  labs(
    title = "Dance by Loudness Rating Across Genres",
    x = "Dance Rating",
    y = "Loudness/Noise Rating"
  )

# Same relationship with a straight-line fit, one panel per topic.
plot4b <- ggplot(clean_MusicDataset, aes(x = danceability, y = loudness)) +
  geom_point(color = "skyblue2") +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  facet_wrap(~topic) +
  labs(
    title = "Dance by Loudness Rating Across Topics",
    x = "Dance Rating",
    y = "Loudness/Noise Rating"
  )

plot4
##
## Pearson's product-moment correlation
##
## data: clean_MusicDataset$danceability and clean_MusicDataset$loudness
## t = 6.9757, df = 28370, p-value = 3.11e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02975787 0.05299017
## sample estimates:
## cor
## 0.04137961
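The p-value falls well below the standard significance level of 0.05, meaning that we can reject the null hypothesis. There is a statistically significant relationship between danceability and loudness. The relationship is very weak, however (r ≈ 0.04); with more than 28,000 songs, even a correlation this small is statistically significant.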
We can also see if this holds by genre:
# Pearson correlation test of danceability against loudness within each genre.
# A `for` loop always evaluates to NULL, so assigning the loop stored nothing;
# lapply() keeps every test in a list named by genre while printing the same
# output as before.
genres <- c("blues", "country", "hip hop", "jazz", "pop", "reggae", "rock")
genres_d_l <- lapply(setNames(genres, genres), function(i) {
  # The name `newtr` is kept so the "data:" line of the printed test is unchanged.
  newtr <- filter(clean_MusicDataset, genre == i)
  print(i)
  result <- cor.test(newtr$danceability, newtr$loudness)
  print(result)
  result
})
## [1] "blues"
##
## Pearson's product-moment correlation
##
## data: newtr$danceability and newtr$loudness
## t = -6.8433, df = 4602, p-value = 8.757e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12888029 -0.07168797
## sample estimates:
## cor
## -0.100367
##
## [1] "country"
##
## Pearson's product-moment correlation
##
## data: newtr$danceability and newtr$loudness
## t = 4.268, df = 5443, p-value = 2.006e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03123863 0.08418628
## sample estimates:
## cor
## 0.05775307
##
## [1] "hip hop"
##
## Pearson's product-moment correlation
##
## data: newtr$danceability and newtr$loudness
## t = -5.6294, df = 902, p-value = 2.414e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2464739 -0.1204754
## sample estimates:
## cor
## -0.1842314
##
## [1] "jazz"
##
## Pearson's product-moment correlation
##
## data: newtr$danceability and newtr$loudness
## t = 12.762, df = 3843, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1711141 0.2317663
## sample estimates:
## cor
## 0.2016335
##
## [1] "pop"
##
## Pearson's product-moment correlation
##
## data: newtr$danceability and newtr$loudness
## t = 9.4732, df = 7040, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.08906863 0.13519459
## sample estimates:
## cor
## 0.112192
##
## [1] "reggae"
##
## Pearson's product-moment correlation
##
## data: newtr$danceability and newtr$loudness
## t = -11.306, df = 2496, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2577051 -0.1830838
## sample estimates:
## cor
## -0.2207174
##
## [1] "rock"
##
## Pearson's product-moment correlation
##
## data: newtr$danceability and newtr$loudness
## t = -5.122, df = 4032, p-value = 3.164e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11098799 -0.04966551
## sample estimates:
## cor
## -0.08040283
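Every p-value is far below the significance level of 0.05. We can reject the null hypothesis for each level of the variable ‘genre’, showing that there is a statistically significant relationship between loudness and danceability within every genre. The direction is not consistent, however: the correlation is negative for blues, hip hop, reggae, and rock and positive for country, jazz, and pop, and every coefficient is small (|r| < 0.23).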
Next, we’ll see if it holds by topic. (Note: for the genres we used a self-made ‘genres’ vector and redefined it each time. To show a different technique, all statistics that separate by topic use the same ‘topicsList’ variable, which we got from the ‘unique()’ command.)
## [1] "sadness" "world/life" "music" "romantic" "violence"
## [6] "obscene" "night/time" "feelings"
# Pearson correlation test of danceability against loudness within each topic.
# A `for` loop always evaluates to NULL, so assigning the loop stored nothing;
# lapply() keeps every test in a list named by topic while printing the same
# output as before.
topics_d_l <- lapply(setNames(topicsList, topicsList), function(i) {
  # The name `newtrib` is kept so the "data:" line of the printed test is unchanged.
  newtrib <- filter(clean_MusicDataset, topic == i)
  print(i)
  result <- cor.test(newtrib$danceability, newtrib$loudness)
  print(result)
  result
})
## [1] "sadness"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = 1.5103, df = 6094, p-value = 0.131
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.005762977 0.044426054
## sample estimates:
## cor
## 0.01934372
##
## [1] "world/life"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = 5.1493, df = 5418, p-value = 2.707e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04324330 0.09623127
## sample estimates:
## cor
## 0.06978651
##
## [1] "music"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = 3.841, df = 2301, p-value = 0.0001258
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03910054 0.12027163
## sample estimates:
## cor
## 0.07981841
##
## [1] "romantic"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = 6.168, df = 1522, p-value = 8.846e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1067859 0.2047694
## sample estimates:
## cor
## 0.1561618
##
## [1] "violence"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = -9.4394, df = 5708, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1494341 -0.0983537
## sample estimates:
## cor
## -0.123976
##
## [1] "obscene"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = 1.3178, df = 4880, p-value = 0.1876
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.00919560 0.04688909
## sample estimates:
## cor
## 0.01886158
##
## [1] "night/time"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = 1.4773, df = 1823, p-value = 0.1398
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01132373 0.08033637
## sample estimates:
## cor
## 0.03457903
##
## [1] "feelings"
##
## Pearson's product-moment correlation
##
## data: newtrib$danceability and newtrib$loudness
## t = 0.33773, df = 610, p-value = 0.7357
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06565338 0.09282752
## sample estimates:
## cor
## 0.01367294
# Overall Pearson correlation between the sadness and feelings ratings; it is
# shown as the subtitle of plot5.
correlation_coefficient2 <- with(clean_MusicDataset, cor(sadness, feelings))

# Scatter plot with the default smoother over all songs.
plot5 <- ggplot(clean_MusicDataset, aes(x = sadness, y = feelings)) +
  geom_point(color = "skyblue2") +
  geom_smooth(color = "blue4") +
  labs(
    title = "Sadness by Feelings Rating",
    subtitle = correlation_coefficient2,
    x = "Sadness Rating",
    y = "Feelings Rating"
  )

# Same relationship with a straight-line fit, one panel per genre.
plot5a <- ggplot(clean_MusicDataset, aes(x = sadness, y = feelings)) +
  geom_point(color = "skyblue2") +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  facet_wrap(~genre) +
  labs(
    title = "Sadness by Feelings Rating Across Genres",
    x = "Sadness Rating",
    y = "Feelings Rating"
  )

# Same relationship with a straight-line fit, one panel per topic.
plot5b <- ggplot(clean_MusicDataset, aes(x = sadness, y = feelings)) +
  geom_point(color = "skyblue2") +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  facet_wrap(~topic) +
  labs(
    title = "Sadness by Feelings Rating Across Topics",
    x = "Sadness Rating",
    y = "Feelings Rating"
  )

plot5
##
## Pearson's product-moment correlation
##
## data: clean_MusicDataset$sadness and clean_MusicDataset$feelings
## t = -11.019, df = 28370, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07685906 -0.05368608
## sample estimates:
## cor
## -0.06528137
Here, we can see that at a significance level of 0.05, the p-value is clearly small enough to reject the null hypothesis. There is clearly a relationship between sadness and feelings ratings, and we can see from the plot that that appears to be a negative relationship (so as sadness rating goes up, feelings rating goes down).
# Pearson correlation test of the feelings rating against the sadness rating
# within each genre. A `for` loop always evaluates to NULL, so assigning the
# loop stored nothing; lapply() keeps every test in a list named by genre
# while printing the same output as before.
genres <- c("blues", "country", "hip hop", "jazz", "pop", "reggae", "rock")
genres_s_f <- lapply(setNames(genres, genres), function(i) {
  # The name `newtr` is kept so the "data:" line of the printed test is unchanged.
  newtr <- filter(clean_MusicDataset, genre == i)
  print(i)
  result <- cor.test(newtr$feelings, newtr$sadness)
  print(result)
  result
})
## [1] "blues"
##
## Pearson's product-moment correlation
##
## data: newtr$feelings and newtr$sadness
## t = -4.9104, df = 4602, p-value = 9.405e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10087151 -0.04339856
## sample estimates:
## cor
## -0.07219496
##
## [1] "country"
##
## Pearson's product-moment correlation
##
## data: newtr$feelings and newtr$sadness
## t = -6.2112, df = 5443, p-value = 5.65e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11020855 -0.05745745
## sample estimates:
## cor
## -0.08389177
##
## [1] "hip hop"
##
## Pearson's product-moment correlation
##
## data: newtr$feelings and newtr$sadness
## t = 1.7608, df = 902, p-value = 0.07862
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.006701953 0.123259703
## sample estimates:
## cor
## 0.05852685
##
## [1] "jazz"
##
## Pearson's product-moment correlation
##
## data: newtr$feelings and newtr$sadness
## t = -3.9591, df = 3843, p-value = 7.66e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09515289 -0.03218941
## sample estimates:
## cor
## -0.06373458
##
## [1] "pop"
##
## Pearson's product-moment correlation
##
## data: newtr$feelings and newtr$sadness
## t = -5.102, df = 7040, p-value = 3.448e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08393334 -0.03739171
## sample estimates:
## cor
## -0.06069552
##
## [1] "reggae"
##
## Pearson's product-moment correlation
##
## data: newtr$feelings and newtr$sadness
## t = -2.9494, df = 2496, p-value = 0.003213
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09792499 -0.01976017
## sample estimates:
## cor
## -0.05893291
##
## [1] "rock"
##
## Pearson's product-moment correlation
##
## data: newtr$feelings and newtr$sadness
## t = -4.7949, df = 4032, p-value = 1.686e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10591272 -0.04454122
## sample estimates:
## cor
## -0.07529827
These results show us that the p-values in each genre were small enough to reject the null hypothesis in every genre except hip-hop, where there does not seem to be a statistically significant relationship between feelings ratings and sadness ratings.
Finally, we’ll see if this connection holds at each level of the variable ‘topic’:
# Pearson correlation test of the sadness rating against the feelings rating
# within each topic. A `for` loop always evaluates to NULL, so assigning the
# loop stored nothing; lapply() keeps every test in a list named by topic
# while printing the same output as before.
topics_s_f <- lapply(setNames(topicsList, topicsList), function(i) {
  # The name `newtribb` is kept so the "data:" line of the printed test is unchanged.
  newtribb <- filter(clean_MusicDataset, topic == i)
  print(i)
  result <- cor.test(newtribb$sadness, newtribb$feelings)
  print(result)
  result
})
## [1] "sadness"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = -6.0878, df = 6094, p-value = 1.214e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10265241 -0.05274791
## sample estimates:
## cor
## -0.07774886
##
## [1] "world/life"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = -2.46, df = 5418, p-value = 0.01393
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.059971877 -0.006784133
## sample estimates:
## cor
## -0.03340165
##
## [1] "music"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = -0.97138, df = 2301, p-value = 0.3315
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06104093 0.02061632
## sample estimates:
## cor
## -0.02024607
##
## [1] "romantic"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = -1.9598, df = 1522, p-value = 0.0502
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -1.001334e-01 4.085144e-05
## sample estimates:
## cor
## -0.05017247
##
## [1] "violence"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = -1.7603, df = 5708, p-value = 0.07842
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.049201423 0.002647674
## sample estimates:
## cor
## -0.02329254
##
## [1] "obscene"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = 0.20729, df = 4880, p-value = 0.8358
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02508700 0.03101714
## sample estimates:
## cor
## 0.002967404
##
## [1] "night/time"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = -2.7583, df = 1823, p-value = 0.005869
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11002664 -0.01863764
## sample estimates:
## cor
## -0.06446731
##
## [1] "feelings"
##
## Pearson's product-moment correlation
##
## data: newtribb$sadness and newtribb$feelings
## t = -3.6585, df = 610, p-value = 0.0002757
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22319227 -0.06806433
## sample estimates:
## cor
## -0.1465291
We can see that for the topics ‘feelings’, ‘night/time’, ‘world/life’, and ‘sadness’, the p-value is certainly below the significance level of 0.05. We can reject the null hypothesis, and conclude that for songs about ‘feelings’, ‘night/time’, ‘world/life’, and ‘sadness’, there is a statistically significant relationship between feelings ratings and sadness ratings, and we can see from the charts that this relationship seems to trend in the negative direction. However, for songs that are ‘romantic’, ‘obscene’, or about ‘violence’ or ‘music’, this relationship does not hold, as the p-values are not small enough to reject the null hypothesis at a significance level of 0.05.
# Overall Pearson correlation between the obscene and romantic ratings; it is
# shown as the subtitle of plot6.
correlation_coefficient3 <- with(clean_MusicDataset, cor(obscene, romantic))

# Scatter plot with the default smoother over all songs.
plot6 <- ggplot(clean_MusicDataset, aes(x = obscene, y = romantic)) +
  geom_point(color = "skyblue2") +
  geom_smooth(color = "blue4") +
  labs(
    title = "Obscene by Romantic Rating",
    subtitle = correlation_coefficient3,
    x = "Obscene Rating",
    y = "Romantic Rating"
  )

# Same relationship with a straight-line fit, one panel per genre.
plot6a <- ggplot(clean_MusicDataset, aes(x = obscene, y = romantic)) +
  geom_point(color = "skyblue2") +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  facet_wrap(~genre) +
  labs(
    title = "Obscene by Romantic Rating Across Genres",
    x = "Obscene Rating",
    y = "Romantic Rating"
  )

# Same relationship with a straight-line fit, one panel per topic.
plot6b <- ggplot(clean_MusicDataset, aes(x = obscene, y = romantic)) +
  geom_point(color = "skyblue2") +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  facet_wrap(~topic) +
  labs(
    title = "Obscene by Romantic Rating Across Topics",
    x = "Obscene Rating",
    y = "Romantic Rating"
  )

plot6
##
## Pearson's product-moment correlation
##
## data: clean_MusicDataset$romantic and clean_MusicDataset$obscene
## t = -27.054, df = 28370, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1699112 -0.1472243
## sample estimates:
## cor
## -0.1585887
So we can clearly see that there is a statistically significant relationship between x, romantic rating, and y, obscenity rating, at a significance level of 0.05. We reject the null hypothesis, and can see that there is a relationship between the obscenity and romantic content of a song. Looking at the graph seems to show that this correlation is in the negative direction.
# Pearson correlation test of the obscene rating against the romantic rating
# within each genre. A `for` loop always evaluates to NULL, so assigning the
# loop stored nothing; lapply() keeps every test in a list named by genre
# while printing the same output as before.
genres <- c("blues", "country", "hip hop", "jazz", "pop", "reggae", "rock")
genres_o_r <- lapply(setNames(genres, genres), function(i) {
  # The name `newtr` is kept so the "data:" line of the printed test is unchanged.
  newtr <- filter(clean_MusicDataset, genre == i)
  print(i)
  result <- cor.test(newtr$obscene, newtr$romantic)
  print(result)
  result
})
## [1] "blues"
##
## Pearson's product-moment correlation
##
## data: newtr$obscene and newtr$romantic
## t = -8.933, df = 4602, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1588426 -0.1020526
## sample estimates:
## cor
## -0.1305547
##
## [1] "country"
##
## Pearson's product-moment correlation
##
## data: newtr$obscene and newtr$romantic
## t = -10.572, df = 5443, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1677748 -0.1157182
## sample estimates:
## cor
## -0.1418446
##
## [1] "hip hop"
##
## Pearson's product-moment correlation
##
## data: newtr$obscene and newtr$romantic
## t = -3.6814, df = 902, p-value = 0.0002457
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.18539906 -0.05691489
## sample estimates:
## cor
## -0.1216666
##
## [1] "jazz"
##
## Pearson's product-moment correlation
##
## data: newtr$obscene and newtr$romantic
## t = -12.289, df = 3843, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2246788 -0.1638469
## sample estimates:
## cor
## -0.1944498
##
## [1] "pop"
##
## Pearson's product-moment correlation
##
## data: newtr$obscene and newtr$romantic
## t = -14.205, df = 7040, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1895467 -0.1441341
## sample estimates:
## cor
## -0.1669289
##
## [1] "reggae"
##
## Pearson's product-moment correlation
##
## data: newtr$obscene and newtr$romantic
## t = -6.2413, df = 2496, p-value = 5.084e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.16239050 -0.08515715
## sample estimates:
## cor
## -0.1239616
##
## [1] "rock"
##
## Pearson's product-moment correlation
##
## data: newtr$obscene and newtr$romantic
## t = -5.5377, df = 4032, p-value = 3.259e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11742709 -0.05617142
## sample estimates:
## cor
## -0.08688137
Finally, let’s check and see if this relationship holds across topics:
# Pearson correlation test of the obscene rating against the romantic rating
# within each topic. A `for` loop always evaluates to NULL, so assigning the
# loop stored nothing; lapply() keeps every test in a list named by topic
# while printing the same output as before.
topics_o_r <- lapply(setNames(topicsList, topicsList), function(i) {
  # The name `newtribble` is kept so the "data:" line of the printed test is unchanged.
  newtribble <- filter(clean_MusicDataset, topic == i)
  print(i)
  result <- cor.test(newtribble$obscene, newtribble$romantic)
  print(result)
  result
})
## [1] "sadness"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -6.9552, df = 6094, p-value = 3.89e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11359477 -0.06378213
## sample estimates:
## cor
## -0.08874394
##
## [1] "world/life"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -6.0051, df = 5418, p-value = 2.036e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10770390 -0.05480861
## sample estimates:
## cor
## -0.08131351
##
## [1] "music"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -3.7207, df = 2301, p-value = 0.0002034
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11780655 -0.03660361
## sample estimates:
## cor
## -0.07733333
##
## [1] "romantic"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -2.9124, df = 1522, p-value = 0.003639
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12419499 -0.02432374
## sample estimates:
## cor
## -0.07444603
##
## [1] "violence"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -4.0754, df = 5708, p-value = 4.656e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07969121 -0.02796440
## sample estimates:
## cor
## -0.05386394
##
## [1] "obscene"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -7.7194, df = 4880, p-value = 1.409e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.13746250 -0.08203416
## sample estimates:
## cor
## -0.1098337
##
## [1] "night/time"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -3.8451, df = 1823, p-value = 0.0001247
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.13502166 -0.04398878
## sample estimates:
## cor
## -0.08969255
##
## [1] "feelings"
##
## Pearson's product-moment correlation
##
## data: newtribble$obscene and newtribble$romantic
## t = -1.6627, df = 610, p-value = 0.09689
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1456486 0.0121511
## sample estimates:
## cor
## -0.06716876
We see from the results above that across nearly all topics, we can reject the null hypothesis, indicating that there is a statistically significant relationship between obscenity and romantic content. However, this relationship does not hold for songs about ‘feelings’.
# Box plot of len for each genre.
ggplot(clean_MusicDataset, aes(x = genre, y = len)) +
  geom_boxplot(color = "blue4", fill = "skyblue2") +
  labs(title = "Length of Song by Genre", x = "Genre", y = "Length of Song")

# Mean of len over all songs; the per-genre means are compared against it.
meanset <- mean(clean_MusicDataset$len)

# Per genre: standard deviation and mean of len, the genre mean's distance
# from the overall mean in units of that genre's standard deviation (zs), and
# the standard-normal cumulative probability of that distance (probs).
lens <- summarise(
  group_by(clean_MusicDataset, genre),
  sds = sd(len),
  means = mean(len),
  zs = ((means - meanset) / sds),
  probs = pnorm(zs)
)
lens
| genre | sds | means | zs | probs |
|---|---|---|---|---|
| blues | 34.94948 | 63.70938 | -0.2666437 | 0.3948717 |
| country | 32.87547 | 62.89440 | -0.3082555 | 0.3789440 |
| hip hop | 45.59809 | 98.29204 | 0.5540493 | 0.7102275 |
| jazz | 46.17335 | 72.09675 | -0.0201782 | 0.4919506 |
| pop | 43.75638 | 78.89804 | 0.1341427 | 0.5533551 |
| reggae | 47.87912 | 98.66293 | 0.5354001 | 0.7038134 |
| rock | 36.11279 | 66.44943 | -0.1821796 | 0.4277209 |
The data is joined based on artist name. We first need to rename the artist column in the clean Music Dataset, because its original name was artist_name. Now that both data sets use the same name for the artist column, we can inner join the data. In the joined dataset we see that there are many artists with multiple songs.
# Align the artist column name with the top-songs data so both tables share a
# join key. any_of() turns the rename into a no-op when `artist_name` is
# already gone, so re-running this chunk after `clean_MusicDataset` has been
# overwritten no longer fails with a missing-column error.
clean_MusicDataset <- clean_MusicDataset %>%
  rename(any_of(c(Artist = "artist_name")))

# Inner join the two data sets on Artist name only.
# NOTE(review): with Artist as the sole key, each music-dataset song is matched
# to every top song by the same artist (a many-to-many join), so one row is an
# artist-level pairing rather than a single matched song. Confirm this fan-out
# is intended before interpreting row counts as song counts.
joined_dataset <- inner_join(clean_MusicDataset, clean_topSong, by = "Artist")

# Drop the music dataset's own track name and release date; the top-songs
# Title and Year columns are the ones kept in the joined set.
joined_dataset_2 <- joined_dataset %>%
  select(-track_name, -release_date)

# Put Artist, Title and Year first, followed by every remaining variable.
final_joined_dataset <- joined_dataset_2 %>%
  select(Artist, Title, Year, everything())
kable(head(final_joined_dataset,1))

| Artist | Title | Year | genre | lyrics | len | dating | violence | world/life | night/time | shake the audience | family/gospel | romantic | communication | obscene | music | movement/places | light/visual perceptions | family/spiritual | like/girls | sadness | feelings | danceability | loudness | acousticness | instrumentalness | valence | energy | topic | age | Sales | Streams | Downloads | RadioPlays | Rating |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| frankie laine | Mule Train | 1949 | pop | believe drop rain fall grow believe darkest night candle glow believe go astray come believe believe believe smallest prayer hear believe great hear word time hear bear baby touch leaf believe believe believe lord heaven guide sin hide believe calvary die pierce believe death rise meet heaven loud amen know believe | 51 | 0.0355371 | 0.0967767 | 0.4434352 | 0.0012837 | 0.0012837 | 0.0270075 | 0.0012837 | 0.0012837 | 0.0012837 | 0.1180338 | 0.0012837 | 0.2126811 | 0.0511242 | 0.0012837 | 0.0012837 | 0.0012837 | 0.3317448 | 0.6475399 | 0.9548192 | 1.5e-06 | 0.3250206 | 0.2632403 | world/life | 1 | 12.442 | 11.18 | 0 | 1.033 | 0 |
# Tally the rows of the joined data per artist, largest tally first.
# NOTE(review): a row of final_joined_dataset comes from an inner join keyed on
# Artist alone, so this counts joined rows per artist - verify that this is the
# intended meaning of "number of songs".
artist_song_count <- count(
  final_joined_dataset,
  Artist,
  name = "number_of_songs",
  sort = TRUE
)
kable(head(artist_song_count,15))

| Artist | number_of_songs |
|---|---|
| elvis presley | 3977 |
| the beatles | 2640 |
| the rolling stones | 1140 |
| madonna | 893 |
| the beach boys | 891 |
| dean martin | 876 |
| michael jackson | 868 |
| nat king cole | 832 |
| johnny cash | 760 |
| mariah carey | 728 |
| queen | 705 |
| elton john | 675 |
| abba | 589 |
| aretha franklin | 568 |
| bing crosby | 552 |
# Mean Sales per artist (missing values ignored), ranked from highest to lowest
sales_grouped <- group_by(final_joined_dataset, Artist)
artist_sales <- summarise(sales_grouped, average_sales = mean(Sales, na.rm = TRUE))
artist_sales <- arrange(artist_sales, desc(average_sales))
kable(head(artist_sales, 15))

| Artist | average_sales |
|---|---|
| joan jett & the blackhearts | 23.7390 |
| snow | 21.3440 |
| nancy sinatra | 20.7990 |
| the kingston trio | 20.2030 |
| berlin | 20.0340 |
| lynn anderson | 18.8070 |
| los lobos | 18.5460 |
| everything but the girl | 18.1670 |
| carly rae jepsen | 17.4700 |
| soft cell | 16.7840 |
| bonnie tyler | 16.7125 |
| dexys midnight runners | 16.6880 |
| gerry rafferty | 16.4060 |
| natalie imbruglia | 16.3120 |
| billy swan | 16.2540 |
# Mean Streams per artist (missing values ignored), ranked from highest to lowest
artist_streams <- arrange(
  summarise(
    group_by(final_joined_dataset, Artist),
    average_streams = mean(Streams, na.rm = TRUE)
  ),
  desc(average_streams)
)
kable(head(artist_streams,15))

| Artist | average_streams |
|---|---|
| otis redding | 18.35100 |
| carole king | 17.06900 |
| isaac hayes | 16.29800 |
| percy sledge | 15.71500 |
| the righteous brothers | 14.84933 |
| the ronettes | 14.60800 |
| joan jett & the blackhearts | 13.42200 |
| janis joplin | 13.25100 |
| bobbie gentry | 13.19000 |
| mary wells | 12.76500 |
| derek & the dominos | 12.67800 |
| carl perkins | 12.28900 |
| barbra streisand | 12.17533 |
| the box tops | 12.12350 |
| neil young | 12.10000 |
# Mean Downloads per artist (missing values ignored), ranked from highest to lowest
artist_downloads <- arrange(
  summarise(
    group_by(final_joined_dataset, Artist),
    average_downloads = mean(Downloads, na.rm = TRUE)
  ),
  desc(average_downloads)
)
kable(head(artist_downloads,15))

| Artist | average_downloads |
|---|---|
| dexys midnight runners | 14.1060 |
| nancy sinatra | 13.4350 |
| soft cell | 12.9200 |
| lynn anderson | 11.4460 |
| bonnie tyler | 11.1275 |
| pussycat | 10.8370 |
| the boomtown rats | 10.6800 |
| los lobos | 10.5590 |
| joan jett & the blackhearts | 10.5380 |
| musical youth | 10.4930 |
| norman greenbaum | 10.0220 |
| gerry rafferty | 10.0140 |
| chris de burgh | 9.7940 |
| tony orlando & dawn | 9.7760 |
| harry nilsson | 9.2705 |
# Number of joined rows per genre (one row per genre, in genre order)
genre_distribution1 <- count(
  final_joined_dataset,
  genre,
  name = "number_of_songs"
)
# Same table, re-ordered so the most frequent genre comes first
genre_distribution2 <- arrange(genre_distribution1, desc(number_of_songs))
kable(genre_distribution2)

| genre | number_of_songs |
|---|---|
| pop | 16720 |
| rock | 16655 |
| jazz | 5039 |
| blues | 4204 |
| country | 2768 |
| reggae | 298 |
| hip hop | 157 |
# Bucket every joined row into the decade of its Year (e.g. 1949 -> 1940).
joined_dataset_deade <- final_joined_dataset %>%
  mutate(decade = floor(Year / 10) * 10)

# Number of joined rows for each decade/genre combination.
count_data <- joined_dataset_deade %>%
  count(decade, genre, name = "Count")

# Grouped bar chart: one bar per genre within each decade. geom_col() plots the
# pre-computed Count values directly.
ggplot(count_data, aes(x = decade, y = Count, fill = genre)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(y = "Song Count", x = "Decade", title = "Song Count by Decade and Genre") +
  theme(legend.position = "top")

# Split each lyric string on runs of whitespace and expand the result so that
# there is one row per word (all other columns are repeated for each word).
words <- final_joined_dataset %>%
  mutate(words = strsplit(as.character(lyrics), "\\s+")) %>%
  unnest(words)
# Count how often each word occurs, order the tally from most to least
# frequent, keep the first 20 rows, and extract just the word column.
top_words <- words %>%
  count(words, name = "count", sort = TRUE) %>%
  head(20) %>%
  pull(words)
# top_words is now a character vector holding the 20 most common words
print(top_words)
## [1] "know" "time" "come" "heart" "like" "go" "away" "baby" "feel"
## [10] "life" "night" "want" "yeah" "right" "live" "leave" "hold" "long"
## [19] "world" "cause"
# Average each selected lyric-topic score within every decade of Year.
# The decade is computed inside group_by(), so it is both created and used as
# the grouping key in one step.
data_aggregated <- final_joined_dataset %>%
  group_by(decade = floor(Year / 10) * 10) %>%
  summarise(
    across(
      c(
        dating, violence, `world/life`, `night/time`, `shake the audience`,
        `family/gospel`, romantic, communication, obscene, `family/spiritual`,
        `like/girls`, sadness, feelings
      ),
      function(score) mean(score, na.rm = TRUE)
    )
  )

# Reshape to long format: one row per (decade, topic) with its average score
data_long <- pivot_longer(
  data_aggregated,
  cols = -decade,
  names_to = "Feeling",
  values_to = "Value"
)

# Interactive line chart with one trace per topic
p <- plot_ly(
  data = data_long,
  x = ~decade,
  y = ~Value,
  color = ~Feeling,
  type = "scatter",
  mode = "lines+markers"
)
p <- layout(
  p,
  title = "Trends of Feelings by Decade",
  xaxis = list(title = "Decade"),
  yaxis = list(title = "Average Value")
)
p